{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the 20 Newsgroups corpus; 'train' is the default split, named here for clarity.\n",
    "news = fetch_20newsgroups(subset='train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import HashingVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hashing vectorizer with default settings; norm='l2' (the default) is written out\n",
    "# so it is visible that each document row is length-normalized.\n",
    "hv = HashingVectorizer(norm='l2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HashingVectorizer keeps no fitted state, so transform() alone yields the same\n",
    "# document-term matrix as fit_transform().\n",
    "X = hv.transform(news.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise cosine similarity between every pair of documents in X.\n",
    "# NOTE(review): the dense n x n result is large (the shape check below shows\n",
    "# 11314 x 11314); only row 0 is used later, so cosine_similarity(X[:1], X)\n",
    "# would be enough if memory becomes a problem.\n",
    "res = cosine_similarity(X, dense_output=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(11314, 11314)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confirm the similarity matrix is square: one row and one column per document.\n",
    "res.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row 0 of the similarity matrix: similarity of document 0 to every document,\n",
    "# wrapped in a Series so the positional index doubles as the document id.\n",
    "d = pd.Series(res[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       1.000000\n",
       "958     0.771061\n",
       "659     0.592881\n",
       "8013    0.584578\n",
       "5553    0.569541\n",
       "dtype: float64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d.sort_values(ascending=False)[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"From: lerxst@wam.umd.edu (where's my thing)\\nSubject: WHAT car is this!?\\nNntp-Posting-Host: rac3.wam.umd.edu\\nOrganization: University of Maryland, College Park\\nLines: 15\\n\\n I was wondering if anyone out there could enlighten me on this car I saw\\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\\nthe front bumper was separate from the rest of the body. This is \\nall I know. If anyone can tellme a model name, engine specs, years\\nof production, where this car is made, history, or whatever info you\\nhave on this funky looking car, please e-mail.\\n\\nThanks,\\n- IL\\n   ---- brought to you by your neighborhood Lerxst ----\\n\\n\\n\\n\\n\""
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Raw text of document 0, the query document whose similarities are held in `d`.\n",
    "news.data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'From: rkim@eecg.toronto.edu (Ryan Kim)\\nSubject: New break pads & exhausts after 96K km (60K mi) on \\'90 Maxima?\\nOrganization: CSRI, University of Toronto\\nLines: 43\\n\\n\\nHi, maybe someone can help me here...\\nI am looking to buy this 1990 Nissan Maxima GXE for CDN$14000 right now.\\nThe car has 96000 km (or about 60000 miles) on it.\\nA typical mileage for 1990 cars seem to be about 70000 km (or about 43K mi).\\nThe seller just informed me that when he brought the car in for certification\\nhe was told that the front break pads and the exhausts had to be replaced\\nto meet the legal standards.  (He said he will replace the components before\\nselling the car to me.)\\n\\nBeing copmletely ignorant to the technical stuff on cars, I don\\'t know\\nwhat this could mean...\\nIs 96K km about the time typical for replacing the above mentioned items?\\nOr is this an indication that the car was abused?\\nWould other things break down or have to be replaced soon?\\nThe seller told me that he used the car on the highway a lot, but,\\nI don\\'t know how to verify this...  I\\'ve seen the paint chipped away\\nin tiny dots in the front edge of the hood, though.\\n\\nAlthough the Maxima is an excellent car and the car is very clean and\\nwell kept, it\\'s currently out of warranty\\n(a similarly priced \\'90 Accord with 70K km will have 2 years or 30K km\\nworth of warranty left) and I don\\'t want to worry about paying for\\nany repair bills...\\nBut, I also need a car for 5 people...  
\\n\\nWhen will the new Maxima come out, by the way?\\n\\nI would very much appreciate your input in this.\\nPlease reply by e-mail (preferred) or post in this newsgroup.\\nThanks!\\n\\nRyan\\n\\n\\n\\n========\\nRyan Kim\\nUniversity of Toronto, EECG, Computer Graphics    rkim@eecg.toronto.edu\\n\"Do not weave between traffic cones at road works.\"\\n                                    - from the new British Highway Code\\n                                           (Toronto Star April 3, 1993)\\n\\n'"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Raw text of document 5553, one of the top-5 matches listed above.\n",
    "# NOTE(review): the index is hard-coded from that output; re-check it if the data changes.\n",
    "news.data[5553]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
